from eurostat_dataset import eurostat_dataset # packed in class the method to wrangle eurostat's table
import pandas as pd
import sqlite3
import plotly.express as px
import plotly.offline as pyo
import warnings
warnings.filterwarnings("ignore")
# The result can be printed to html by nbconvert. For this:
# - pip install nbconvert -U in the environment
# - jupyter nbconvert Eurostat.ipynb --no-input --to html --no-prompt - in the terminal
# to activate env: venv\Scripts\activate.bat
# python -m pip freeze > requirements.txt
# uncheck this comment to download dataset to the sqlite database
# eurostat_dataset(code='NRG_BAL_S').WriteToDatabase()
# eurostat_dataset(code='NRG_INF_EPCRW').WriteToDatabase()
# write to database, so you don't need to wait everytime when you restart the kernel or the file
#check the tables that has been written to the database:
def db_tables():
con = sqlite3.connect('ngr.db')
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())
con.close
# read table from db to DataFrame
def read_df_from_db(table):
con = sqlite3.connect('ngr.db')
cursor = con.cursor()
df = pd.read_sql_query("SELECT * FROM "+ table , con)
con.close
df.drop(columns=['index'],inplace=True)
return df
tables = db_tables()
print(f'The list of tables in the local sqlite database {tables}') # we can see the tables in the database 'ngr.db'
[('nrg_bal_s',), ('nrg_inf_epcrw',)]
The list of tables in the local sqlite database None
renew = eurostat_dataset(code='nrg_inf_epcrw').DatasetInfo()
dataset_info=eurostat_dataset(code='NRG_BAL_S').DatasetInfo()
print(f'The first dataset: "{dataset_info[0]}" dates from {dataset_info[1]} to {dataset_info[2]}.')
print(f'The second dataset: "{renew[0]}" dates from {renew[1]} to {renew[2]}.')
The first dataset: "Simplified energy balances" dates from 1990 to 2020. The second dataset: "Electricity production capacities for renewables and wastes" dates from 1990 to 2020.
#reading the table of the first dataset from local database
bal = read_df_from_db('nrg_bal_s')
balance=bal.copy()
balance = balance.query('unit=="Gigawatt-hour" and siec=="Total" and nrg_bal=="Final consumption - energy use" \
and geo not in ("Euro area - 19 countries (from 2015)","European Union - 27 countries (from 2020)")')
get_energy = read_df_from_db('nrg_inf_epcrw')
Final consumption - energy use (Gigawatt-hour)
balance.head()
| nrg_bal | siec | unit | geo | date | value | |
|---|---|---|---|---|---|---|
| 5778 | Final consumption - energy use | Total | Gigawatt-hour | Albania | 2020 | 21474.051 |
| 5779 | Final consumption - energy use | Total | Gigawatt-hour | Austria | 2020 | 288624.871 |
| 5780 | Final consumption - energy use | Total | Gigawatt-hour | Bosnia and Herzegovina | 2020 | 46804.808 |
| 5781 | Final consumption - energy use | Total | Gigawatt-hour | Belgium | 2020 | 360985.528 |
| 5782 | Final consumption - energy use | Total | Gigawatt-hour | Bulgaria | 2020 | 110633.154 |
balance.value=pd.to_numeric(balance.value)
balance.date=pd.to_numeric(balance.date)
#I like plotly. It is more interactive)
# https://plotly.com/python/plotly-express/#gallery
pyo.init_notebook_mode() #trying to render the plot in github, but it is warking only in nbviewer!https://nbviewer.org/
fig = px.line(balance, x="date", y="value", color="geo",title="Final consumption - energy use,Gigawatt-hour")
pyo.iplot(fig)
#get population dataset and clean
pop = eurostat_dataset('TPS00001').GetDf()
population=pop.copy()
population['value'] = population.value.astype('str').fillna('').str.extract(r"(\d+)|(\d+)\.")[0].astype('Int32')
population['date'] = pd.to_numeric(population.date)
#merge with balance tables
m_balance = balance.merge(population[["geo","date","value"]],on=['geo',"date"],suffixes=("", "_population"))
m_balance = m_balance.assign(per_capita=lambda x: x.value*1000/x.value_population)
pyo.init_notebook_mode()
fig = px.line(m_balance, x="date", y="per_capita", color="geo",
title="Final consumption - energy use per capita,Megawatt-hour")
pyo.iplot(fig)
get_energy.head()
| siec | plant_tec | unit | geo | date | value | |
|---|---|---|---|---|---|---|
| 0 | Solid biofuels | Net maximum electrical capacity | Megawatt | Albania | 2020 | 0.000 |
| 1 | Solid biofuels | Net maximum electrical capacity | Megawatt | Austria | 2020 | 816.199 |
| 2 | Solid biofuels | Net maximum electrical capacity | Megawatt | Bosnia and Herzegovina | 2020 | 1.120 |
| 3 | Solid biofuels | Net maximum electrical capacity | Megawatt | Belgium | 2020 | 563.500 |
| 4 | Solid biofuels | Net maximum electrical capacity | Megawatt | Bulgaria | 2020 | 15.064 |
"Gross electricity production"
el_production = bal.query('unit=="Gigawatt-hour" and siec=="Total" and nrg_bal=="Gross electricity production" \
and geo not in ("Euro area - 19 countries (from 2015)","European Union - 27 countries (from 2020)")')
el_production.date=pd.to_numeric(el_production.date)
final_cons = balance[['geo','date','value']].rename(columns={'value':'Final consumption'}).set_index(['geo','date']).squeeze()
el_prod = el_production[['geo','date','value']].rename(columns={'value':'Electricity production'}).set_index(['geo','date']).squeeze()
renewables = bal.query('unit=="Gigawatt-hour" and siec=="Renewables and biofuels" and nrg_bal=="Gross electricity production" \
and geo not in ("Euro area - 19 countries (from 2015)","European Union - 27 countries (from 2020)")')
renewables.date=pd.to_numeric(renewables.date)
renSeries = renewables[['geo','date','value']].rename(columns={'value':'Electricity production from Renewables and biofuels'}).set_index(['geo','date']).squeeze()
ren_table = get_energy.query('siec in ("Solid biofuels","Biogases","Hydro","Geothermal","Wind","Solar") \
and geo not in ("Euro area - 19 countries (from 2015)","European Union - 27 countries (from 2020)")')
ren_table.date=pd.to_numeric(ren_table.date)
ren_table['perc'] = ren_table['value'] / ren_table.groupby(['geo','date'])['value'].transform('sum')
ren_table['max_perc'] = ren_table.groupby(['geo','date'])['perc'].transform(max)
reneable_source_name = ren_table.query('perc==max_perc')[['geo','date','siec']].drop_duplicates(subset=['geo','date'])\
.rename(columns={'siec':'Main Renewable Source'}).set_index(['geo','date']).squeeze()
reneable_source_perc = ren_table.query('perc==max_perc')[['geo','date','perc']].drop_duplicates(subset=['geo','date'])\
.rename(columns={'perc':'Main Renewable Source,perc'}).set_index(['geo','date']).squeeze()
concated = pd.concat([final_cons,el_prod,renSeries,reneable_source_name,reneable_source_perc],axis=1).reset_index()
concated['Electricity production'] = pd.to_numeric(concated['Electricity production'])
concated['Electricity production from Renewables and biofuels'] = pd.to_numeric(concated['Electricity production from Renewables and biofuels'])
concated["Electricity production from Final consumption,perc"] = concated['Electricity production']/concated['Final consumption']
concated["Electricity production from Renewables and biofuels,perc"] = concated['Electricity production from Renewables and biofuels']/concated['Electricity production']
cols = concated.columns.to_list()
new_cols = cols[:5]+cols[7:9]+cols[5:7]
concated = concated[new_cols]
concated
| geo | date | Final consumption | Electricity production | Electricity production from Renewables and biofuels | Electricity production from Final consumption,perc | Electricity production from Renewables and biofuels,perc | Main Renewable Source | Main Renewable Source,perc | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Albania | 2020 | 21474.051 | 5313.166 | 5313.166 | 0.247423 | 1.000000 | Hydro | 0.991279 |
| 1 | Austria | 2020 | 288624.871 | 72556.246 | 58779.511 | 0.251386 | 0.810123 | Hydro | 0.700284 |
| 2 | Bosnia and Herzegovina | 2020 | 46804.808 | 16874.000 | 4983.000 | 0.360519 | 0.295306 | Hydro | 0.947741 |
| 3 | Belgium | 2020 | 360985.528 | 88890.800 | 24458.500 | 0.246245 | 0.275152 | Solar | 0.448351 |
| 4 | Bulgaria | 2020 | 110633.154 | 40731.058 | 7977.586 | 0.368163 | 0.195860 | Hydro | 0.646253 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1235 | Slovakia | 1990 | 161007.115 | 26132.000 | 2515.000 | 0.162303 | 0.096242 | NaN | NaN |
| 1236 | Türkiye | 1990 | 438615.366 | 57543.000 | 23228.000 | 0.131192 | 0.403663 | Hydro | 0.997346 |
| 1237 | Ukraine | 1990 | 1671235.901 | 298835.000 | 10723.000 | 0.178811 | 0.035883 | NaN | NaN |
| 1238 | United Kingdom | 1990 | 1474621.763 | 319737.000 | 7793.000 | 0.216826 | 0.024373 | Hydro | 0.974981 |
| 1239 | Kosovo (under United Nations Security Council ... | 1990 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1240 rows × 9 columns
pyo.init_notebook_mode()
fig = px.line(concated, x="date", y="Electricity production from Renewables and biofuels,perc", color="geo",
title="Electricity production from Renewables and biofuels,perc")
pyo.iplot(fig)
pyo.init_notebook_mode()
fig = px.line(concated, x="date", y="Electricity production from Renewables and biofuels,perc", color="geo",
title="Electricity production from Renewables and biofuels,perc")
pyo.iplot(fig)